# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #  
#
#' @title  Create train and test splits for classifier training on pooled
#'          sample of crowd-coded tweets
#' @author Hauke Licht
#
# +~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~+~ #

# setup ----

# load packages
library(readr)
library(dplyr)
library(tidyr)
library(purrr)

# project directories, all relative to the working directory
base_path <- "."
data_path <- file.path(base_path, "data")
fits_path <- file.path(base_path, "data", "fits")
# the labelings read by this script live under data/intermediate/labelings
labelings_path <- file.path(base_path, "data", "intermediate", "labelings")

# load labelings induced for tweets in pooled sample ----

# The compact `col_types` string fixes the type of each of the 20 columns
# (c = character, i = integer, d = double) instead of letting readr guess.
fp <- file.path(labelings_path, "dawidskene_labelings_pooled_samples.csv")
labeled_tweets <- read_csv(
  file = fp,
  col_types = "cciccccdiccddiddddcc"
)

# row counts per value of `sample`
table(labeled_tweets[["sample"]])

# identify duplicates: collect the status IDs that occur more than once
dupl_status_ids <- with(labeled_tweets, status_id[duplicated(status_id)])

# note: because *Sinn Fein* uses the same account in the UK and in Ireland,
#       there are duplicate tweets; inspect the affected rows side by side
labeled_tweets %>%
  filter(status_id %in% dupl_status_ids) %>%
  select(status_id, text_en, party_name_short, country_iso3c, labeling) %>%
  arrange(status_id)

# to prevent data leakage, collect the item IDs of the duplicated tweets that
# come from the British context (labelings are consistent anyways); these
# items are the ones to discard
discard_these_tweets <- labeled_tweets %>%
  filter(country_iso3c == "GBR" & status_id %in% dupl_status_ids) %>%
  pull(item_id)

# left-join original texts to labeled tweets ----

# load tweets with "political" classifications (source of the `text` and
# `political` columns joined below), discarding tweets collected post hoc
tweets <- file.path(data_path, "input", "all_tweets_classified_political.rds") %>%
  read_rds() %>%
  filter(collected_posthoc == "no")

# columns kept in the model matrix, grouped by role
key_cols <- c(
  "item_id", "country_iso3c", "party_id", "party_name_short", "user_id",
  "status_id"
)
text_cols <- c("text", "text_en")
meta_cols <- c("sample", "cluster_id", "prob_political", "n_judgments")

model_matrix <- labeled_tweets %>%
  # drop the items flagged for removal
  filter(!item_id %in% discard_these_tweets) %>%
  # attach original text and "political" classification; the right-hand table
  # is de-duplicated on the selected columns before joining
  left_join(
    tweets %>%
      select(country_iso3c, party_id, user_id, status_id, text, political) %>%
      distinct(),
    by = c("country_iso3c", "party_id", "user_id", "status_id")
  ) %>%
  # fix the column order: keys, texts, label, meta data, classification
  select(!!key_cols, !!text_cols, labeling, !!meta_cols, political)

# verify: tabulate missing values in the text, labeling and classification
# columns of the joined data
table(is.na(model_matrix[["text"]]))
table(is.na(model_matrix[["text_en"]]))
table(is.na(model_matrix[["labeling"]]))
table(is.na(model_matrix[["political"]]))
# distribution of the "political" classification
table(model_matrix[["political"]])

# sample into train and test set ----

# Each row is flagged as test (`test_ == TRUE`) independently with probability
# 0.2, drawing separately within each country-by-labeling group. Because the
# draws are independent, the realized test share only approximates 20%.
set.seed(1234)
data_splits <- model_matrix %>%
  # drop all "Unsure" labels (rows with a missing labeling are dropped as well,
  # since `filter()` discards rows where the condition evaluates to NA)
  filter(labeling != "yes-unsure") %>%
  group_by(country_iso3c, labeling) %>%
  # spell out TRUE/FALSE: `T` and `F` are ordinary variables that can be
  # reassigned, whereas TRUE and FALSE are reserved words
  mutate(
    test_ = sample(c(TRUE, FALSE), size = n(), replace = TRUE, prob = c(.2, .8))
  ) %>%
  ungroup()

# count the rows whose tweet (status ID) shows up in both the test and the
# training set; the split is leakage-free only if this prints 0
data_splits %>%
  group_by(status_id) %>%
  filter(length(unique(test_)) > 1) %>%
  nrow()

# inspect: cross-tabulate labelings against the test-set indicator
table(labeling = data_splits$labeling, test_ = data_splits$test_)

fp <- file.path(data_path, "intermediate", "training", "training_data_pooled_samples.csv")
# CAVEAT: The original train--test assignment used in the paper cannot be reproduced
#         So we don't overwrite the file: the hard-coded FALSE keeps this branch
#         disabled, on top of the check that the file does not exist yet.
if (!file.exists(fp) && FALSE) {
  # write the split data created above with readr's CSV writer
  write_csv(data_splits, fp)
}
